library(dplyr)
library(ggplot2)
library(tidyverse)
# Load the raw smartphone dataset and show the type of every column.
mobiles <- utils::read.csv(file = "mobiles_dataset.csv")
utils::str(mobiles)
## 'data.frame': 925 obs. of 15 variables:
## $ Company.Name : chr "Apple" "Apple" "Apple" "Apple" ...
## $ Model.Name : chr "iPhone 16 128GB" "iPhone 16 256GB" "iPhone 16 512GB" "iPhone 16 Plus 128GB" ...
## $ Mobile.Weight : chr "174g" "174g" "174g" "203g" ...
## $ RAM : chr "6GB" "6GB" "6GB" "6GB" ...
## $ Front.Camera : chr "12MP" "12MP" "12MP" "12MP" ...
## $ Back.Camera : chr "48MP" "48MP" "48MP" "48MP" ...
## $ Processor : chr "A17 Bionic" "A17 Bionic" "A17 Bionic" "A17 Bionic" ...
## $ Battery.Capacity.mAh : int 3600 3600 3600 4200 4200 4200 4400 4400 4400 4500 ...
## $ Screen.Size.inches : num 6.1 6.1 6.1 6.7 6.7 6.7 6.1 6.1 6.1 6.7 ...
## $ Launched.Price.Pakistan.PKR: int 224999 234999 244999 249999 259999 274999 284999 294999 314999 314999 ...
## $ Launched.Price.India.INR : int 79999 84999 89999 89999 94999 104999 99999 104999 114999 109999 ...
## $ Launched.Price.China.CNY : int 5799 6099 6499 6199 6499 6999 6999 7099 7499 7499 ...
## $ Launched.Price.USA.USD : int 799 849 899 899 949 999 999 1049 1099 1099 ...
## $ Launched.Price.Dubai.AED : int 2799 2999 3199 3199 3399 3599 3499 3699 3899 3799 ...
## $ Launched.Year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
# Start again from the raw file so the derived columns are added to a clean
# data frame.
mobiles <- read.csv(file = "mobiles_dataset.csv")

# Express every regional launch price in USD using fixed conversion factors
# (the USA price is already in USD and is copied as-is).
mobiles[["Pakistan_USD"]] <- mobiles[["Launched.Price.Pakistan.PKR"]] * 0.0036
mobiles[["India_USD"]] <- mobiles[["Launched.Price.India.INR"]] * 0.011
mobiles[["China_USD"]] <- mobiles[["Launched.Price.China.CNY"]] * 0.14
mobiles[["Dubai_USD"]] <- mobiles[["Launched.Price.Dubai.AED"]] * 0.27
mobiles[["USA_USD"]] <- mobiles[["Launched.Price.USA.USD"]]

# Correlation of battery capacity with the USD price in each region.
price_columns <- c("Pakistan_USD", "India_USD", "China_USD", "USA_USD", "Dubai_USD")
correlation <- cor(
  x = mobiles[["Battery.Capacity.mAh"]],
  y = mobiles[price_columns]
)
print(correlation)
## Pakistan_USD India_USD China_USD USA_USD Dubai_USD
## [1,] -0.06091272 -0.01905429 -0.04104334 -0.0411368 -0.04890776
As we can see, all of the correlations are close to zero, which indicates that battery size has almost no linear relationship with the price of a phone.
# Scatter plot: battery capacity against the US launch price.
ggplot(data = mobiles, mapping = aes(x = Battery.Capacity.mAh, y = USA_USD)) +
  geom_point() +
  labs(
    x = "Battery Capacity (mAh)",
    y = "Price (USD)",
    title = "Battery Capacity vs Price (USD)"
  )
The price range for phones with a battery capacity of 1000-6000 mAh is comparable to, or sometimes even higher than, the price range of phones with a 7000+ mAh battery. I therefore doubt that battery size has any influence on price, but let’s analyze other visualizations as well.
# Bucket the US launch price into labelled price bands.
# The last band is open-ended (upper break = Inf): with a finite upper break of
# 3000 any phone priced above $3000 would get NA instead of "$2000+" and would
# silently drop out of every per-category summary and plot.
mobiles$Price_Category <- cut(
  mobiles$USA_USD,
  breaks = c(0, 250, 450, 650, 800, 1100, 1500, 2000, Inf),
  labels = c(
    "<$250", "$250-450", "$450-650", "$650-800",
    "$800-1100", "$1100-1500", "$1500-2000", "$2000+"
  )
)
# Overall distribution of battery capacity across all phones.
ggplot(data = mobiles, mapping = aes(x = Battery.Capacity.mAh)) +
  geom_histogram(fill = "turquoise", color = "black", bins = 20) +
  labs(
    x = "Battery Capacity",
    y = "Count",
    title = "Distribution of Battery Capacity"
  ) +
  theme_bw()
# One battery-capacity histogram per price category. Each panel has its own
# y axis, and the legend is hidden because the facet strips already name the
# category.
ggplot(
  data = mobiles,
  mapping = aes(x = Battery.Capacity.mAh, fill = Price_Category)
) +
  geom_histogram(color = "black", bins = 20) +
  facet_wrap(vars(Price_Category), scales = "free_y") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    x = "Battery Capacity (mAh)",
    y = "Count",
    title = "Battery Capacity Distribution by Price Category"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Mean battery capacity within each price category.
Summary <- with(mobiles, tapply(Battery.Capacity.mAh, Price_Category, mean))
print(Summary)
## <$250 $250-450 $450-650 $650-800 $800-1100 $1100-1500 $1500-2000
## 4977.500 5169.480 5088.536 4901.561 4803.190 4854.415 5456.706
## $2000+
## 4783.333
# Unweighted mean of the per-category means (every price category counts
# equally, regardless of how many phones it contains).
Overall_AVG <- mean(x = Summary)
cat(
  "\nThe average of all the price category averages is : ",
  Overall_AVG
)
##
## The average of all the price category averages is : 5004.34
For each price range, both the faceted histograms (a comparison of the distributions) and the mean battery capacity of each price category show that the average battery size does not change much as the price increases. A difference of a couple of hundred mAh is not significant, based on my personal observations, experience and knowledge gained from professional phone reviewers. Basically, no phone manufacturer in a competitive market wants to sacrifice battery capacity for other components, because it is a huge part of the user experience. Additionally, since there are outlandish battery sizes (~9000 mAh) both in the $650-800 category and in the flagship category ($1100-2000), I see no significant increase in price even for these big-battery outliers.
# Parse the leading integer of strings such as "8GB" into a separate numeric
# column. regmatches() silently drops entries that do not match, which would
# make the result shorter than the data frame and break the assignment, so the
# column is pre-filled with NA and only the matching rows are overwritten.
ram_match <- regexpr("^\\d+", mobiles$RAM)
ram_found <- !is.na(ram_match) & ram_match != -1
mobiles$RAM_GB <- NA_real_
mobiles$RAM_GB[ram_found] <- as.numeric(regmatches(mobiles$RAM, ram_match))

# Correlation between RAM and the USD price in each region.
# Pairwise-complete observations keep a single unparsable RAM value (or a
# missing price) from turning the whole result into NA.
price_columns <- c("Pakistan_USD", "India_USD", "China_USD", "USA_USD", "Dubai_USD")
correlation <- round(
  cor(mobiles$RAM_GB, mobiles[, price_columns], use = "pairwise.complete.obs"),
  4
)
print(correlation)
## Pakistan_USD India_USD China_USD USA_USD Dubai_USD
## [1,] 0.409 0.4162 0.4217 0.462 0.473
# RAM size against the z-scored US price; the x axis is limited to 0-18 GB.
ggplot(data = mobiles, mapping = aes(x = RAM_GB, y = scale(USA_USD))) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 18)) +
  labs(
    x = "RAM (GB)",
    y = "Scaled Price (Z-score)",
    title = "RAM Size vs. Scaled Price"
  ) +
  theme_bw()
# Mean RAM (GB) of the phones in each price category.
ram_by_price <- stats::aggregate(
  RAM_GB ~ Price_Category,
  data = mobiles,
  FUN = mean
)
print(ram_by_price)
## Price_Category RAM_GB
## 1 <$250 5.025641
## 2 $250-450 7.799213
## 3 $450-650 8.744000
## 4 $650-800 9.292683
## 5 $800-1100 8.936620
## 6 $1100-1500 9.938462
## 7 $1500-2000 11.529412
## 8 $2000+ 10.666667
# Mean USD price in every region for each RAM size; RAM sizes above 16 GB are
# left out of the chart.
price_by_ram <- aggregate(
  cbind(Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD) ~ RAM_GB,
  data = mobiles,
  FUN = mean
)
price_by_ram <- price_by_ram[price_by_ram$RAM_GB <= 16, ]

# Reshape to one row per (RAM size, region) and draw dodged bars.
ggplot(
  data = price_by_ram %>%
    pivot_longer(cols = -RAM_GB, names_to = "Region", values_to = "Price") %>%
    mutate(Region = gsub("_USD", "", Region)),
  mapping = aes(x = factor(RAM_GB), y = Price, fill = Region)
) +
  geom_col(position = position_dodge()) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    x = "RAM (GB)",
    y = "Average Price (USD)",
    fill = "Region",
    title = "Average Price of Smartphones by RAM Size"
  ) +
  theme_bw()
# Box plot of RAM per price category.
# The visible range is set with coord_cartesian() rather than scale limits:
# scale limits discard out-of-range rows *before* the box plot statistics are
# computed, which would distort the medians and quartiles, whereas
# coord_cartesian() only zooms the view.
ggplot(mobiles, aes(x = Price_Category, y = RAM_GB)) +
  geom_boxplot(fill = "turquoise") +
  labs(
    title = "RAM Distribution by Price Category",
    x = "Price Category",
    y = "RAM (GB)"
  ) +
  coord_cartesian(ylim = c(0, 18)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
RAM size and price show a clear, moderately strong link (correlation of about 0.44). Looking at the charts, we can see that this link is stronger for phones priced up to roughly $650-$800, where RAM rises steadily with price. For more expensive phones the link gets weaker, and RAM stops growing even as prices go up. This happens because, after a certain point, more RAM no longer makes a phone much better. From what I know, phone makers start to improve their software instead once the amount of RAM is good enough. Adding too much RAM costs more money but doesn’t help much (it’s like making a teapot too big when only a few people will drink from it).
# Per-phone spread of the five regional USD prices:
#   Price_CV        - coefficient of variation (sd / mean) in percent
#   Price_Range_Pct - (max - min) as a percentage of the US price
# rowwise() makes sd()/mean()/max()/min() operate across the columns of a
# single row. The trailing ungroup() is essential: without it `mobiles` stays a
# row-wise data frame, so every later dplyr verb would keep evaluating one row
# at a time (slow, and an error for any expression that is not length 1).
mobiles <- mobiles %>%
  rowwise() %>%
  mutate(
    Price_CV = sd(c(Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD)) /
      mean(c(Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD)) * 100,
    Price_Range_Pct = (
      (max(c(Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD)) -
        min(c(Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD))) /
        USA_USD
    ) * 100
  ) %>%
  ungroup()
# Brands compared in the price-variation charts.
top_brands <- c(
  "Apple", "Samsung", "Google", "Huawei", "Xiaomi",
  "Oppo", "Vivo", "OnePlus", "Realme", "Honor"
)

# Mean and standard deviation of the per-phone price CV for each of those
# brands, together with the number of models.
brand_stats <- mobiles %>%
  dplyr::filter(Company.Name %in% top_brands) %>%
  dplyr::group_by(Company.Name) %>%
  dplyr::summarize(
    Mean_CV = mean(Price_CV),
    SD_CV = sd(Price_CV),
    Count = n()
  )

# Bars sorted from highest to lowest mean CV; Apple is highlighted.
ggplot(
  data = brand_stats,
  mapping = aes(
    x = reorder(Company.Name, -Mean_CV),
    y = Mean_CV,
    fill = ifelse(Company.Name == "Apple", "Apple", "Other")
  )
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    name = "Brand Type",
    values = c("Apple" = "pink", "Other" = "turquoise")
  ) +
  labs(
    x = "Brand",
    y = "Mean Coefficient of Variation (%)",
    title = "Average Price Variation by Brand"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Brands compared in the regional price-ratio charts.
top_brands <- c(
  "Apple", "Samsung", "Google", "Huawei", "Xiaomi",
  "Oppo", "Vivo", "OnePlus", "Realme", "Honor"
)

# For each brand: the mean regional price as a percentage of the US price
# (100 = same as the US), plus the number of models.
# This is a single definition; previously the table was computed twice and the
# second copy, which overwrote the first, had lost the na.rm = TRUE handling.
# Filtering to the brands of interest first also drops rows with a missing
# company name, because %in% never returns NA.
price_ratios <- mobiles %>%
  dplyr::filter(Company.Name %in% top_brands) %>%
  dplyr::group_by(Company.Name) %>%
  dplyr::summarize(
    Pak_vs_US = mean(Pakistan_USD / USA_USD * 100, na.rm = TRUE),
    India_vs_US = mean(India_USD / USA_USD * 100, na.rm = TRUE),
    China_vs_US = mean(China_USD / USA_USD * 100, na.rm = TRUE),
    Dubai_vs_US = mean(Dubai_USD / USA_USD * 100, na.rm = TRUE),
    Model_Count = n()
  )
# Reshape price_ratios to one row per (brand, region) so the bars can be
# dodged, and give the regions readable labels in a fixed order.
ratio_cols <- c("Pak_vs_US", "India_vs_US", "China_vs_US", "Dubai_vs_US")
price_ratios_long <- price_ratios %>%
  pivot_longer(
    cols = all_of(ratio_cols),
    names_to = "Region",
    values_to = "Ratio"
  ) %>%
  mutate(
    Region = factor(
      Region,
      levels = c("Pak_vs_US", "India_vs_US", "China_vs_US", "Dubai_vs_US"),
      labels = c("Pakistan", "India", "China", "Dubai")
    )
  )

# One panel per brand: percentage difference of each regional price from the
# US price (the dashed line at 0 marks parity with the US).
ggplot(
  data = price_ratios_long,
  mapping = aes(x = Region, y = Ratio - 100, fill = Company.Name)
) +
  geom_col(position = position_dodge()) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_wrap(vars(Company.Name), scales = "free_y") +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Regional Price Differences Compared to US Price",
    subtitle = "Positive values indicate higher prices than US",
    x = "Region",
    y = "% Difference from US Price"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8))
# Long table with one row per (phone, region) USD price, flagged as Apple or
# not.
price_data_long <- mobiles %>%
  select(Company.Name, Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD) %>%
  pivot_longer(
    cols = c(Pakistan_USD, India_USD, China_USD, Dubai_USD, USA_USD),
    names_to = "Region",
    values_to = "Price"
  ) %>%
  mutate(
    Region = gsub("_USD", "", Region),
    Apple = ifelse(Company.Name == "Apple", "Apple", "Other Brands")
  )

# Price histograms in a grid: one row per region, one column per brand group.
ggplot(data = price_data_long, mapping = aes(x = Price, fill = Apple)) +
  geom_histogram(position = "identity", bins = 20) +
  facet_grid(Region ~ Apple, scales = "free_y") +
  scale_fill_manual(values = c("Apple" = "red", "Other Brands" = "turquoise")) +
  labs(
    x = "Price (USD)",
    y = "Count",
    title = "Price Distribution by Region: Apple vs. Other Brands"
  ) +
  theme_bw()
# Adds, for every phone, the percentage difference of each regional USD price
# from the US price.
add_us_price_diffs <- function(phones) {
  phones %>%
    mutate(
      Pak_Diff = (Pakistan_USD - USA_USD) / USA_USD * 100,
      India_Diff = (India_USD - USA_USD) / USA_USD * 100,
      China_Diff = (China_USD - USA_USD) / USA_USD * 100,
      Dubai_Diff = (Dubai_USD - USA_USD) / USA_USD * 100
    )
}

# Price differences from the US price, separately for Apple and for the rest.
apple_data <- mobiles %>%
  filter(Company.Name == "Apple") %>%
  add_us_price_diffs()
non_apple_data <- mobiles %>%
  filter(Company.Name != "Apple") %>%
  add_us_price_diffs()

# Stack both groups and reshape to one row per (phone, region).
diff_data <- bind_rows(
  apple_data %>%
    select(Company.Name, Pak_Diff, India_Diff, China_Diff, Dubai_Diff) %>%
    mutate(Type = "Apple"),
  non_apple_data %>%
    select(Company.Name, Pak_Diff, India_Diff, China_Diff, Dubai_Diff) %>%
    mutate(Type = "Other Brands")
) %>%
  pivot_longer(
    cols = c(Pak_Diff, India_Diff, China_Diff, Dubai_Diff),
    names_to = "Region",
    values_to = "Percent_Diff"
  ) %>%
  mutate(Region = gsub("_Diff", "", Region))

# Density of the percentage differences per region; values outside
# [-50, 50] are removed by xlim() before the density is estimated.
ggplot(data = diff_data, mapping = aes(x = Percent_Diff, fill = Type)) +
  geom_density(alpha = 0.5) +
  facet_wrap(vars(Region), scales = "free_y") +
  scale_fill_manual(values = c("Apple" = "salmon", "Other Brands" = "turquoise")) +
  labs(
    title = "Distribution of Price Differences from US Price",
    subtitle = "Apple vs. Other Brands",
    x = "Percentage Difference from US Price",
    y = "Density"
  ) +
  theme_bw() +
  xlim(-50, 50)
## Warning: Removed 65 rows containing non-finite outside the scale range
## (`stat_density()`).
# The ten brands (with at least five models) whose prices vary least across
# regions, ordered from lowest to highest average CV.
stable_brands <- mobiles %>%
  group_by(Company.Name) %>%
  summarize(Avg_CV = mean(Price_CV), Model_Count = n()) %>%
  filter(Model_Count >= 5) %>%
  arrange(Avg_CV) %>%
  head(n = 10)

# Horizontal bars; Apple's bar is coloured differently from the others.
ggplot(
  data = stable_brands,
  mapping = aes(x = reorder(Company.Name, -Avg_CV), y = Avg_CV)
) +
  geom_col(
    fill = ifelse(stable_brands$Company.Name == "Apple", "salmon", "turquoise")
  ) +
  coord_flip() +
  labs(
    title = "Top 10 Brands with Most Stable Global Pricing",
    subtitle = "Lower values indicate more consistent pricing across regions",
    x = "Brand",
    y = "Average Coefficient of Variation (%)"
  ) +
  theme_bw()
Conclusions: Apple devices actually have lower price variation across regions than most other brands. Looking at the 1st and 5th plots, Apple shows a coefficient of variation of around 11%, which is lower than that of brands like Motorola, Honor, Tecno, Samsung, Google, Sony, and Xiaomi. Only Realme shows more stable global pricing than Apple. According to the US comparison facet plot, Apple devices appear to have the highest markup in India, where they’re priced approximately 10% higher than in the US. This contrasts with Pakistan, where Apple devices are around 15% cheaper than in the US. Realme stands out as having the most stable pricing across regions, with the lowest coefficient of variation (around 6-7%), which makes its global pricing the most consistent among all brands shown. Nokia also shows relatively stable pricing compared to many other brands. In contrast, brands like Motorola, Honor, and Tecno show much higher price variation across regions, with coefficients of variation ranging from 15% to 17%.
# Average USD price over all phones in each region, rounded to whole dollars.
region_prices <- data.frame(
  Region = c("Pakistan", "India", "China", "USA", "Dubai"),
  Price_USD = round(c(
    mean(mobiles$Pakistan_USD, na.rm = TRUE),
    mean(mobiles$India_USD, na.rm = TRUE),
    mean(mobiles$China_USD, na.rm = TRUE),
    mean(mobiles$USA_USD, na.rm = TRUE),
    mean(mobiles$Dubai_USD, na.rm = TRUE)
  ))
)

# Sort from most to least expensive and freeze that order in the factor
# levels so plots keep it.
region_prices <- region_prices %>%
  arrange(desc(Price_USD)) %>%
  mutate(Region = factor(Region, levels = Region))
# Brands with at least ten models in the dataset.
top_brands <- names(which(table(mobiles$Company.Name) >= 10))

# Long table of the mean USD price per (brand, region): five rows per brand
# with the columns Company.Name, Region and Price_USD.
# Previously data.frame() was called with `Region` and `Price_USD` repeated
# five times, which produced ONE wide row per brand with mangled column names
# (Region.1, Price_USD.1, ...) instead of one row per region.
region_price_cols <- c(
  Pakistan = "Pakistan_USD",
  India = "India_USD",
  China = "China_USD",
  USA = "USA_USD",
  Dubai = "Dubai_USD"
)
# Build one small data frame per brand and bind them once, rather than growing
# the result with rbind() inside a loop.
brand_region_price <- bind_rows(lapply(top_brands, function(brand) {
  # %in% (unlike ==) never yields NA, so rows with a missing company name are
  # simply excluded.
  brand_data <- mobiles[mobiles$Company.Name %in% brand, ]
  data.frame(
    Company.Name = brand,
    Region = names(region_price_cols),
    Price_USD = vapply(
      region_price_cols,
      function(price_col) mean(brand_data[[price_col]], na.rm = TRUE),
      numeric(1),
      USE.NAMES = FALSE
    )
  )
}))
# For every brand in top_brands: the cheapest and the most expensive region
# (by mean USD price) and the gap between them as a percentage of the
# cheapest price.
brand_price_range <- do.call(rbind, lapply(top_brands, function(brand) {
  is_brand <- mobiles$Company.Name == brand
  regions <- c("Pakistan", "India", "China", "USA", "Dubai")
  brand_prices <- c(
    mean(mobiles$Pakistan_USD[is_brand], na.rm = TRUE),
    mean(mobiles$India_USD[is_brand], na.rm = TRUE),
    mean(mobiles$China_USD[is_brand], na.rm = TRUE),
    mean(mobiles$USA_USD[is_brand], na.rm = TRUE),
    mean(mobiles$Dubai_USD[is_brand], na.rm = TRUE)
  )
  cheapest <- which.min(brand_prices)
  priciest <- which.max(brand_prices)
  data.frame(
    Company.Name = brand,
    Min_Region = regions[cheapest],
    Min_Price = brand_prices[cheapest],
    Max_Region = regions[priciest],
    Max_Price = brand_prices[priciest],
    Price_Range_Pct = (brand_prices[priciest] - brand_prices[cheapest]) /
      brand_prices[cheapest] * 100
  )
}))

# Largest regional price gap first.
brand_price_range <- brand_price_range %>%
  arrange(desc(Price_Range_Pct))
# The (up to) ten brands with the most models. head() is used instead of
# `[1:10]` because indexing past the end of the table yields NA names when the
# data contains fewer than ten brands.
top_10_brands <- names(
  head(sort(table(mobiles$Company.Name), decreasing = TRUE), 10)
)

# For each of those brands: how far the mean price in every region deviates,
# in percent, from the brand's average over the five regional means.
# Result: one row per (brand, region) with the columns Company.Name, Region and
# Price_Difference_Pct, in the region order Pakistan, India, China, USA, Dubai.
heatmap_price_cols <- c(
  Pakistan = "Pakistan_USD",
  India = "India_USD",
  China = "China_USD",
  USA = "USA_USD",
  Dubai = "Dubai_USD"
)
# Each regional mean is computed once per brand, and the per-brand frames are
# bound in a single step instead of growing the result with rbind() in a loop.
heatmap_data <- bind_rows(lapply(top_10_brands, function(brand) {
  brand_rows <- mobiles[mobiles$Company.Name %in% brand, ]
  regional_means <- vapply(
    heatmap_price_cols,
    function(price_col) mean(brand_rows[[price_col]], na.rm = TRUE),
    numeric(1)
  )
  brand_avg <- mean(regional_means, na.rm = TRUE)
  data.frame(
    Company.Name = brand,
    Region = names(heatmap_price_cols),
    Price_Difference_Pct = unname((regional_means - brand_avg) / brand_avg * 100)
  )
}))
# Bar chart of the average price per region, with the dollar value printed
# above each bar.
ggplot(
  data = region_prices,
  mapping = aes(x = Region, y = Price_USD, fill = Region)
) +
  geom_col(width = 0.8) +
  geom_text(
    mapping = aes(label = sprintf("$%d", Price_USD)),
    vjust = -0.5,
    size = 4,
    fontface = "bold"
  ) +
  labs(
    x = NULL,
    y = "Average Price (USD)",
    title = "Average Smartphone Price by Region"
  ) +
  theme_bw() +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
# Horizontal bars for the ten brands with the largest regional price gap.
# The percentage is printed past the end of each bar and the
# "most expensive vs cheapest" region pair in the middle of it.
ggplot(
  data = head(brand_price_range, n = 10),
  mapping = aes(
    x = reorder(Company.Name, Price_Range_Pct),
    y = Price_Range_Pct,
    fill = Price_Range_Pct
  )
) +
  geom_col() +
  geom_text(
    mapping = aes(label = sprintf("%.0f%%", Price_Range_Pct)),
    hjust = -0.1
  ) +
  geom_text(
    mapping = aes(
      y = Price_Range_Pct / 2,
      label = sprintf("%s vs %s", Max_Region, Min_Region)
    ),
    hjust = 0.5,
    color = "white",
    size = 3
  ) +
  coord_flip() +
  labs(
    x = "",
    y = "Price Difference (%)",
    title = "Brands with Largest Regional Price Differences"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  )
# Heat map of each brand's regional deviation from its own average price.
# The colour scale is capped at +/-40%; values beyond that are squished to the
# end colours rather than shown as missing.
ggplot(
  data = heatmap_data,
  mapping = aes(x = Region, y = Company.Name, fill = Price_Difference_Pct)
) +
  geom_tile() +
  geom_text(
    mapping = aes(label = sprintf("%.0f%%", Price_Difference_Pct)),
    size = 3
  ) +
  scale_fill_gradient2(
    low = "green",
    mid = "white",
    high = "red",
    midpoint = 0,
    limits = c(-40, 40),
    oob = scales::squish
  ) +
  labs(x = "", y = "", title = "Regional Price Variations") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank(),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  )
Q5 conclusions:
# Average price per region (same chart as earlier in the report), with the
# dollar value printed above each bar.
ggplot(region_prices, aes(x = Region, y = Price_USD, fill = Region)) +
  geom_col(width = 0.8) +
  geom_text(
    aes(label = sprintf("$%d", Price_USD)),
    size = 4,
    fontface = "bold",
    vjust = -0.5
  ) +
  labs(
    title = "Average Smartphone Price by Region",
    x = NULL,
    y = "Average Price (USD)"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
# Brands shown in the per-company charts, in display order.
company_order <- c(
  "Apple", "Google", "Honor", "Huawei", "Infinix", "IQOO", "Lenovo",
  "Motorola", "Nokia", "OnePlus", "Oppo", "Poco", "POCO", "Realme",
  "Samsung", "Sony", "Tecno", "Vivo", "Xiaomi"
)

# Phones of those brands that have a US price, with the brand stored as a
# factor in the display order above.
# The filter refers to the column as `USA_USD`, not `mobiles$USA_USD`: inside a
# dplyr verb the `mobiles$` form bypasses data masking and passes the whole
# column, which fails as soon as the data frame is row-wise or grouped.
# ungroup() is a no-op on a plain data frame and clears any row-wise state
# left on `mobiles` by earlier steps.
plot_data <- mobiles %>%
  dplyr::ungroup() %>%
  dplyr::filter(!is.na(USA_USD), Company.Name %in% company_order) %>%
  dplyr::mutate(Company.Name = factor(Company.Name, levels = company_order))
# Fill / line colour for every brand, keyed by brand name. Both spellings of
# Poco that occur in company_order share one colour.
manual_colors <- c(
  Apple = "#F07369", Google = "#F5795F", Honor = "#E48800",
  Huawei = "#BC9D01", Infinix = "#9CA600", IQOO = "#68A307",
  Lenovo = "#02B913", Motorola = "#03BD62", Nokia = "#07BC8C",
  OnePlus = "#00C0B3", Oppo = "#03BDD4", Poco = "#05A6FF",
  POCO = "#05A6FF", Realme = "#7E97FF", Samsung = "#B37CF1",
  Sony = "#E36DF7", Tecno = "#F863E0", Vivo = "#FE63BE",
  Xiaomi = "#FF6A9A"
)
# US price distribution per brand: box plot with the individual phones
# overlaid as jittered points.
# - The visible price range is set with coord_cartesian() instead of scale
#   limits. Scale limits drop phones outside 100-2700 USD before the box plot
#   statistics are computed and therefore distort medians and quartiles;
#   coord_cartesian() only zooms the view.
# - outlier.shape = NA hides the box plot's own outlier markers, because every
#   phone is already drawn by geom_jitter() and outliers would appear twice.
ggplot(plot_data, aes(x = Company.Name, y = USA_USD, fill = Company.Name)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 0.8, color = "black", alpha = 0.7) +
  labs(
    title = "Price Distribution by Company in USA",
    subtitle = "A boxplot showing how the price varies by company, with individual data points overlaid",
    x = "Company",
    y = "Price in USD"
  ) +
  coord_cartesian(ylim = c(100, 2700)) +
  scale_fill_manual(values = manual_colors) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(face = "italic", size = 9)
  )
# Battery capacity against US price: colour encodes the brand and point size
# the screen size (the size legend is hidden).
ggplot(
  data = plot_data,
  mapping = aes(
    x = Battery.Capacity.mAh,
    y = USA_USD,
    color = Company.Name,
    size = Screen.Size.inches
  )
) +
  geom_point() +
  scale_x_continuous(limits = c(1500, 11000)) +
  scale_y_continuous(limits = c(0, 2800)) +
  scale_color_manual(values = manual_colors) +
  guides(size = "none") +
  labs(
    title = "Battery Capacity vs. Price in USA",
    subtitle = "The relationship between battery capacity, price, and screen size across different smartphone brands",
    x = "Battery Capacity",
    y = "Price",
    color = "Brand"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(face = "italic", size = 9)
  )
# Battery capacity against US price for the five brands named in the shape
# scale; shape encodes the brand, colour and size the screen size.
# - The data is filtered to those five brands explicitly. Previously all
#   brands were passed in and the other fourteen were dropped only because they
#   had no shape value, which emits "removed rows" warnings and hides the
#   intent.
# - Vivo gets its own shape (8); it used to share shape 16 with Apple, which
#   contradicted the "Different Shapes for Each Brand" subtitle.
top5_shapes <- c("Apple" = 16, "Honor" = 17, "Oppo" = 18, "Samsung" = 15, "Vivo" = 8)
ggplot(
  plot_data %>% dplyr::filter(Company.Name %in% names(top5_shapes)),
  aes(
    x = Battery.Capacity.mAh,
    y = USA_USD,
    shape = Company.Name,
    color = Screen.Size.inches,
    size = Screen.Size.inches
  )
) +
  geom_point() +
  labs(
    title = "Battery Capacity vs. Price for Top 5 Brands",
    subtitle = "Different Shapes for Each Brand, Color by Screen Size (USA)",
    x = "Battery Capacity (mAh)",
    y = "Price (USD)",
    shape = "Brand",
    color = "Screen Size"
  ) +
  scale_x_continuous(limits = c(2000, 10500)) +
  scale_y_continuous(limits = c(100, 2000)) +
  scale_shape_manual(values = top5_shapes) +
  scale_color_continuous() +
  guides(size = "none", color = "none") +
  theme_minimal() +
  theme(
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(face = "italic", size = 8)
  )
# Numeric weight in grams parsed from strings such as "174g".
# The decimal point is kept: stripping every non-digit would turn "174.5g"
# into 1745.
mobiles$Weight_g <- as.numeric(gsub("[^0-9.]", "", mobiles$Mobile.Weight))

# Weight bands: light (< 170 g), medium, heavy (>= 210 g).
mobiles$Weight_Cat <- ifelse(
  mobiles$Weight_g >= 210, "Heavy (≥210g)",
  ifelse(mobiles$Weight_g < 170, "Light (<170g)", "Medium")
)

# Screen bands: small (< 6"), medium, large (>= 7").
mobiles$Screen_Cat <- ifelse(
  mobiles$Screen.Size.inches >= 7, "Large (≥7\")",
  ifelse(mobiles$Screen.Size.inches < 6, "Small (<6\")", "Medium")
)

# Combined form-factor category, "<weight band> / <screen band>".
# paste() would turn a missing band into the literal text "NA", so rows with
# an unknown weight or screen band are set to a real NA instead; otherwise a
# later is.na(Form_Factor) check could never exclude them.
mobiles$Form_Factor <- ifelse(
  is.na(mobiles$Weight_Cat) | is.na(mobiles$Screen_Cat),
  NA_character_,
  paste(mobiles$Weight_Cat, "/", mobiles$Screen_Cat)
)
# Average US price and number of phones per form factor, using only rows
# where both the form factor and the US price are present.
priced_rows <- mobiles[!is.na(mobiles$Form_Factor) & !is.na(mobiles$USA_USD), ]
mean_price_by_form <- tapply(priced_rows$USA_USD, priced_rows$Form_Factor, mean)
count_by_form <- tapply(priced_rows$USA_USD, priced_rows$Form_Factor, length)
form_price <- data.frame(
  Form_Factor = names(mean_price_by_form),
  Average_Price = as.numeric(mean_price_by_form),
  Count = as.numeric(count_by_form)
)

# Keep form factors with at least ten phones, most expensive first.
form_price <- form_price[form_price$Count >= 10, ]
form_price <- form_price[order(-form_price$Average_Price), ]
# Horizontal bars of the average price per form factor, cheapest at the
# bottom.
ggplot(
  data = form_price,
  mapping = aes(x = reorder(Form_Factor, Average_Price), y = Average_Price)
) +
  geom_col(fill = "coral", width = 0.7) +
  # Dollar label just past the end of every bar
  geom_text(
    mapping = aes(label = sprintf("$%d", round(Average_Price))),
    hjust = -0.1,
    size = 5
  ) +
  coord_flip() +
  # 20% of headroom on the price axis so the labels are not cut off
  scale_y_continuous(limits = c(0, max(form_price$Average_Price) * 1.2)) +
  labs(
    x = "",
    y = "Average Price (USD)",
    title = "Form Factor Impact on Smartphone Prices"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 14))